This assignment uses a small subset of the data from Kaggle's Yelp Business Rating Prediction competition.
Description of the data:
`yelp.json` is the original format of the file. `yelp.csv` contains the same data in a more convenient format. Both files are in this repo, so there is no need to download the data from the Kaggle website.
In [5]:
# access yelp.csv (NOTE: the path used below is absolute and machine-specific, not relative; adjust it to your copy of the repo)
import pandas as pd
import seaborn as sns
# Load the CSV copy of the review data, then preview its first rows.
csv_path = 'C:/Users/Joshuaw/Documents/GA_Data_Science/data/yelp.csv'
yelp = pd.read_csv(csv_path)
yelp.head()
Out[5]:
In [11]:
# read the data from yelp.json into a list of rows
# each row is decoded into a dictionary using json.loads()
import json
import pandas as pd
import seaborn as sns
# Open in plain text mode: the 'U' (universal newlines) flag is deprecated and
# rejected by newer Python releases, and json.loads() tolerates the trailing
# line ending left on each row.
with open('C:/Users/Joshuaw/Documents/GA_Data_Science/data/yelp.json', 'r') as f:
    # every line of the file is parsed on its own into one dictionary
    data = [json.loads(row) for row in f]
In [3]:
# Display the first decoded review dictionary.
first_review = data[0]
first_review
Out[3]:
In [10]:
# Build a DataFrame with one row per review dictionary.
# The cool / funny / useful counts are still nested inside 'votes' here, e.g.
# u'votes': {u'cool': 2, u'funny': 0, u'useful': 5}
yelp = pd.DataFrame(data=data)
yelp.head(n=2)
Out[10]:
In [12]:
# Flatten the nested vote counts into their own DataFrame columns.
# Each value is read from the original row dictionaries, e.g. row['votes']['cool'].
for vote_type in ('cool', 'useful', 'funny'):
    yelp[vote_type] = [row['votes'][vote_type] for row in data]
In [13]:
# Remove the 'votes' column in place: its nested contents (cool / funny /
# useful) now live in their own columns, so the dictionary column is redundant.
del yelp['votes']
yelp.head(n=1)
Out[13]:
In [7]:
# treat stars as a categorical variable and look for differences between groups
# Only numeric columns can be averaged. Selecting them explicitly gives the same
# table on pandas versions that silently dropped text columns and avoids the
# TypeError that newer pandas raises when asked for the mean of text columns.
yelp.select_dtypes(include=['number']).groupby('stars').mean()
Out[7]:
In [39]:
# correlation matrix
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white")
# Create the correlation matrix from the numeric columns only; text columns
# cannot be correlated and make newer pandas raise instead of skipping them.
corr = yelp.select_dtypes(include=['number']).corr()
# Generate a mask for the upper triangle (diagonal included). The builtin
# `bool` is used because the `np.bool` alias no longer exists in newer NumPy.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 8))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio; the mask is now
# actually passed in, so cells where it is True are not drawn.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            square=True, linewidths=.5,
            cbar_kws={"shrink": .5})
Out[39]:
In [38]:
# One regression scatter plot of stars against each vote-count column.
# NOTE(review): newer seaborn releases renamed `size` to `height` -- confirm
# against the installed version before changing it.
sns.set(style="white")
vote_columns = ['cool', 'useful', 'funny']
sns.pairplot(
    yelp,
    x_vars=vote_columns,
    y_vars='stars',
    size=6,
    aspect=0.7,
    kind='reg',
    markers='+',
)
Out[38]:
In [40]:
# Predictors: the three vote counts. Response: the star rating.
feature_cols = ['cool', 'useful', 'funny']
x = yelp[feature_cols]
y = yelp['stars']
In [49]:
#instantiate the linear regression model
from sklearn.linear_model import LinearRegression
# Store the regression model as 'linreg'. An intercept is fitted because we
# do not assume the data has been centered.
linreg = LinearRegression(fit_intercept=True)
# fit a linear regression model
linreg.fit(x, y)
# Coefficient of determination (R^2), scored on the same data the model was
# fitted on. A previous run gave 0.044.
r2 = linreg.score(x, y, sample_weight=None)
# Pair each feature name with its coefficient. list() materialises the pairs
# so that print() shows them on Python 3 as well, where zip() is lazy.
# TODO: check whether these are b weights or beta weights and how to assess
# the significance of the coefficients.
coef = list(zip(feature_cols, linreg.coef_))
print(r2)
print(coef)
print(linreg.intercept_)
# overall votes is a poor predictor of stars, sentiment is likely to be a better predictor
In [53]:
# train_test_split moved to sklearn.model_selection and the old
# sklearn.cross_validation module was later removed; try the current location
# first and fall back for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import metrics
import numpy as np
In [54]:
# define a function that accepts a list of features and returns testing RMSE
def train_test_rmse(feature_cols):
    """Return the testing RMSE of a linear regression on the given features.

    The columns named in ``feature_cols`` are taken from the notebook-level
    ``yelp`` DataFrame as predictors and ``yelp.stars`` as the response. The
    data is split with ``random_state=1``, a LinearRegression is fitted on the
    training part, and the root mean squared error on the test part is returned.
    """
    features = yelp[feature_cols]
    target = yelp.stars
    split_parts = train_test_split(features, target, random_state=1)
    features_train, features_test, target_train, target_test = split_parts
    # fit() returns the estimator itself, so the calls can be chained
    model = LinearRegression().fit(features_train, target_train)
    predictions = model.predict(features_test)
    mse = metrics.mean_squared_error(target_test, predictions)
    return np.sqrt(mse)
In [55]:
# Testing RMSE when all three vote counts are used as features.
all_vote_features = ['cool', 'useful', 'funny']
train_test_rmse(all_vote_features)
Out[55]:
In [15]:
# Testing RMSE for each pair of vote features. print() is called as a
# function so the cell runs on Python 3 as well as Python 2.
print(train_test_rmse(['cool', 'useful']))
print(train_test_rmse(['cool', 'funny']))
print(train_test_rmse(['useful', 'funny']))
In [57]:
# Review length in characters, stored as a new 'length' column.
yelp['length'] = yelp['text'].map(len)
In [58]:
# Indicator columns (1/0): does the review text contain the keyword,
# ignoring case?
for keyword in ('love', 'hate'):
    yelp[keyword] = yelp.text.str.contains(keyword, case=False).astype(int)
In [59]:
# Testing RMSE with the vote counts plus the three text-derived features.
vote_features = ['cool', 'useful', 'funny']
text_features = ['length', 'love', 'hate']
train_test_rmse(vote_features + text_features)
Out[59]:
In [61]:
# Split x / y at notebook level (outside of train_test_rmse), with the same
# random_state the function uses.
split_parts = train_test_split(x, y, random_state=1)
x_train, x_test, y_train, y_test = split_parts
In [62]:
# Float array of zeros shaped like y_test; it is filled with the baseline next.
y_null = np.zeros_like(y_test, dtype=np.float64)
In [63]:
# Every "null model" prediction is the mean of y_test.
baseline_value = y_test.mean()
y_null.fill(baseline_value)
In [67]:
# calculate null RMSE
import numpy as np
# RMSE of the constant predictions in y_null against y_test. print() is called
# as a function so the cell runs on Python 3 as well as Python 2.
print(np.sqrt(metrics.mean_squared_error(y_test, y_null)))
In [68]:
# import and instantiate KNN
from sklearn.neighbors import KNeighborsClassifier
# K-nearest-neighbours classifier that looks at the 50 nearest neighbours.
neighbor_count = 50
knn = KNeighborsClassifier(n_neighbors=neighbor_count)
In [69]:
# classification models will automatically treat the response value (1/2/3/4/5) as unordered categories
# Use the notebook-level split (x_train / x_test): the upper-case X_train and
# X_test names exist only inside train_test_rmse, so referring to them here
# raises NameError.
knn.fit(x_train, y_train)
y_pred_class = knn.predict(x_test)
# print() as a function keeps the cell valid on Python 3 as well as Python 2
print(metrics.accuracy_score(y_test, y_pred_class))
In [73]:
# Fit a linear regression on the training split and produce continuous
# star predictions for the test split.
# fit() returns the estimator itself, so construction and fitting are chained.
linreg = LinearRegression().fit(x_train, y_train)
y_pred = linreg.predict(x_test)
In [74]:
# Turn the continuous predictions into class labels by rounding to the
# nearest integer.
y_pred_class = np.round(y_pred)
In [75]:
# Classification accuracy of the rounded regression predictions. print() is
# called as a function so the cell runs on Python 3 as well as Python 2.
print(metrics.accuracy_score(y_test, y_pred_class))
In [ ]: